{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/felipemaiapolo/zelenskyy_speeches/blob/main/extracting_sentiments.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3e85fbe6",
      "metadata": {
        "id": "3e85fbe6"
      },
      "source": [
        "# Extracting sentiments from texts"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%capture\n",
        "# Pinned versions keep the analysis reproducible. %pip (rather than !pip) installs into the\n",
        "# environment of the running kernel, so the packages imported below are the ones installed here.\n",
        "%pip install --upgrade --force-reinstall numpy==1.20.3 pandas==1.3.3 gensim==4.2.0 wordcloud==1.8.2.2 tqdm==4.62.2 fsspec==2022.5.0"
      ],
      "metadata": {
        "id": "bjCREQxuJaKd"
      },
      "id": "bjCREQxuJaKd",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "id": "fb94ed96",
      "metadata": {
        "id": "fb94ed96"
      },
      "source": [
        "## Setting-up"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "d0521cb7",
      "metadata": {
        "id": "d0521cb7"
      },
      "source": [
        "Loading packages (you may need to restart the runtime after installing the packages above):"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7965951d",
      "metadata": {
        "id": "7965951d"
      },
      "outputs": [],
      "source": [
        "import json\n",
        "import os\n",
        "import string\n",
        "\n",
        "import gensim.downloader as api\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "from tqdm import tqdm\n",
        "from wordcloud import WordCloud"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "845bd9c8",
      "metadata": {
        "id": "845bd9c8"
      },
      "source": [
        "Some useful functions:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f9ee38bd",
      "metadata": {
        "id": "f9ee38bd"
      },
      "outputs": [],
      "source": [
        "#Tokenizer\n",
        "def tokenize(txt):\n",
        "    \"\"\"Lower-case `txt`, strip possessives, quotes and punctuation, and split it into word tokens.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    txt : str\n",
        "        Raw text.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    list of str\n",
        "        The tokens in order of appearance; an empty or whitespace-only input gives [].\n",
        "    \"\"\"\n",
        "    txt = txt.lower()\n",
        "    # The typographic possessive and curly double quotes are not part of string.punctuation.\n",
        "    for junk in ('’s', '“', '”'):\n",
        "        txt = txt.replace(junk, '')\n",
        "    # An ellipsis separates words: turn it into a space so 'end...next' is not fused into one token.\n",
        "    txt = txt.replace('...', ' ').replace('…', ' ')\n",
        "    txt = txt.translate(str.maketrans('', '', string.punctuation))\n",
        "    # split() without an argument splits on any whitespace run (spaces, tabs, newlines) and never yields ''.\n",
        "    return txt.split()\n",
        "\n",
        "#Sentiment score of a single word\n",
        "def get_sentiment(word, pos, neg):\n",
        "    \"\"\"Return the summed similarity of `word` to the `pos` words minus its summed similarity to the `neg` words.\n",
        "\n",
        "    Similarities are obtained from the global embedding `model` through `model.similarity`.\n",
        "    \"\"\"\n",
        "    pos_sims = [model.similarity(word, anchor) for anchor in pos]\n",
        "    neg_sims = [model.similarity(word, anchor) for anchor in neg]\n",
        "    return np.sum(pos_sims) - np.sum(neg_sims)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "fe4bcb4a",
      "metadata": {
        "id": "fe4bcb4a"
      },
      "source": [
        "Defining the lists of positive and negative words:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "137d7397",
      "metadata": {
        "id": "137d7397"
      },
      "outputs": [],
      "source": [
        "# Seed words anchoring the positive and the negative end of the sentiment scale\n",
        "pos = [\n",
        "    'good', 'excellent', 'correct', 'best',\n",
        "    'happy', 'positive', 'fortunate',\n",
        "]\n",
        "neg = [\n",
        "    'bad', 'terrible', 'wrong', 'worst',\n",
        "    'disappointed', 'negative', 'unfortunate',\n",
        "]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "67ad5bcb",
      "metadata": {
        "id": "67ad5bcb"
      },
      "source": [
        "## Extracting sentiments from speeches"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3b5c70cb",
      "metadata": {
        "id": "3b5c70cb"
      },
      "source": [
        "Loading data:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2a5a876b",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 354
        },
        "id": "2a5a876b",
        "outputId": "64841031-1f8a-4efe-b875-09b8ea122ddc"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   Unnamed: 0                                                url  \\\n",
              "0           0  https://www.president.gov.ua/en/news/den-ukray...   \n",
              "1           1  https://www.president.gov.ua/en/news/zsu-krok-...   \n",
              "2           2  https://www.president.gov.ua/en/news/vijna-ne-...   \n",
              "3           3  https://www.president.gov.ua/en/news/rosiya-ro...   \n",
              "4           4  https://www.president.gov.ua/en/news/mayemo-is...   \n",
              "\n",
              "                 date                                              title  \\\n",
              "0 2022-07-24 22:02:00  The Day of Ukrainian Statehood on July 28 will...   \n",
              "1 2022-07-23 23:42:00  Armed Forces of Ukraine advancing step by step...   \n",
              "2 2022-07-23 19:04:00  The war did not break Ukraine and will not bre...   \n",
              "3 2022-07-22 22:28:00  Russia did everything to destroy Ukraine\" s=\"\"...   \n",
              "4 2022-07-21 22:25:00  We have a significant potential for the advanc...   \n",
              "\n",
              "                                                text  \n",
              "0  Good health to you, fellow Ukrainians! An impo...  \n",
              "1  Dear Ukrainian men and women! The one hundred ...  \n",
              "2  Dear First Ladies and Gentlemen! Dear Ladies a...  \n",
              "3  Dear Ukrainian men and women! Dear all our def...  \n",
              "4  Good health to you, fellow Ukrainians! Today, ...  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-973b9b79-6133-42d6-ab7c-63ea69d0bdc2\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Unnamed: 0</th>\n",
              "      <th>url</th>\n",
              "      <th>date</th>\n",
              "      <th>title</th>\n",
              "      <th>text</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0</td>\n",
              "      <td>https://www.president.gov.ua/en/news/den-ukray...</td>\n",
              "      <td>2022-07-24 22:02:00</td>\n",
              "      <td>The Day of Ukrainian Statehood on July 28 will...</td>\n",
              "      <td>Good health to you, fellow Ukrainians! An impo...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1</td>\n",
              "      <td>https://www.president.gov.ua/en/news/zsu-krok-...</td>\n",
              "      <td>2022-07-23 23:42:00</td>\n",
              "      <td>Armed Forces of Ukraine advancing step by step...</td>\n",
              "      <td>Dear Ukrainian men and women! The one hundred ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>2</td>\n",
              "      <td>https://www.president.gov.ua/en/news/vijna-ne-...</td>\n",
              "      <td>2022-07-23 19:04:00</td>\n",
              "      <td>The war did not break Ukraine and will not bre...</td>\n",
              "      <td>Dear First Ladies and Gentlemen! Dear Ladies a...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>3</td>\n",
              "      <td>https://www.president.gov.ua/en/news/rosiya-ro...</td>\n",
              "      <td>2022-07-22 22:28:00</td>\n",
              "      <td>Russia did everything to destroy Ukraine\" s=\"\"...</td>\n",
              "      <td>Dear Ukrainian men and women! Dear all our def...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>4</td>\n",
              "      <td>https://www.president.gov.ua/en/news/mayemo-is...</td>\n",
              "      <td>2022-07-21 22:25:00</td>\n",
              "      <td>We have a significant potential for the advanc...</td>\n",
              "      <td>Good health to you, fellow Ukrainians! Today, ...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-973b9b79-6133-42d6-ab7c-63ea69d0bdc2')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-973b9b79-6133-42d6-ab7c-63ea69d0bdc2 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-973b9b79-6133-42d6-ab7c-63ea69d0bdc2');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 5
        }
      ],
      "source": [
        "# Fetch the speeches CSV and convert the `date` column to datetimes in a single chain\n",
        "sp = (\n",
        "    pd.read_csv('https://github.com/felipemaiapolo/zelenskyy_speeches/raw/main/data/zelensky_speeches.csv')\n",
        "    .assign(date=lambda df: pd.to_datetime(df['date']))\n",
        ")\n",
        "sp.head()"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "db5af538",
      "metadata": {
        "id": "db5af538"
      },
      "source": [
        "Checking embeddings models available in Gensim:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "3eca1aaf",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "3eca1aaf",
        "outputId": "e0d5ce3a-83c9-4164-b48d-c59dbef4ce00"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors of the movie matrix.\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state-of-the-art semantic vectors (also known as word embeddings) that can be used directly as a representation of word meanings or as a starting point for further machine learning. ConceptNet Numberbatch is part of the ConceptNet open data project. ConceptNet provides lots of ways to compute with word meanings, one of which is word embeddings. ConceptNet Numberbatch is a snapshot of just the word embeddings. It is built using an ensemble that combines data from ConceptNet, word2vec, GloVe, and OpenSubtitles 2016, using a variation on retrofitting.\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets, 27B tokens, 1.2M vocab, uncased (https://nlp.stanford.edu/projects/glove/)\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https://nlp.stanford.edu/projects/glove/).\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https://nlp.stanford.edu/projects/glove/).\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, 27B tokens, 1.2M vocab, uncased (https://nlp.stanford.edu/projects/glove/)\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "word2vec-google-news-300 (3000000 records): Pre-trained vectors trained on a part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in 'Distributed Representations of Words and Phrases and their Compositionality' (https://code.google.com/archive/p/word2vec/).\n",
            "\n",
            "\n",
            "\n",
            "\n",
            "word2vec-ruscorpora-300 (184973 records): Word2vec Continuous Skipgram vectors trained on full Russian National Corpus (about 250M words). The model contains 185K words.\n",
            "\n",
            "\n",
            "\n",
            "\n"
          ]
        }
      ],
      "source": [
        "# Print every pre-trained model listed by gensim's downloader with its record count and description\n",
        "info = api.info()\n",
        "for model_name, model_data in sorted(info['models'].items()):\n",
        "    num_records = model_data.get('num_records', -1)  # -1 marks a missing record count\n",
        "    description = model_data['description']\n",
        "    print('%s (%d records): %s' % (model_name, num_records, description))\n",
        "    print('\\n\\n\\n')"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8b144ac6",
      "metadata": {
        "id": "8b144ac6"
      },
      "source": [
        "We are going to load **\"glove-wiki-gigaword-300\"**:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2e73b388",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "2e73b388",
        "outputId": "e6c89c0b-404e-4196-8984-00454752df48"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[==================================================] 100.0% 376.1/376.1MB downloaded\n"
          ]
        }
      ],
      "source": [
        "# Load the GloVe word vectors; the recorded output of this cell shows a 376.1MB download\n",
        "model = api.load(\"glove-wiki-gigaword-300\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "457ae858",
      "metadata": {
        "id": "457ae858"
      },
      "source": [
        "Getting the model vocabulary:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d4e10f5d",
      "metadata": {
        "id": "d4e10f5d"
      },
      "outputs": [],
      "source": [
        "# A set gives O(1) membership tests (`w in vocab_model`); with a list of ~400k words every\n",
        "# lookup would be a linear scan over the whole vocabulary.\n",
        "vocab_model = set(model.key_to_index)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "e600bc08",
      "metadata": {
        "id": "e600bc08"
      },
      "source": [
        "Getting the vocabulary used in the texts:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7369f02a",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "7369f02a",
        "outputId": "1824a117-db81-48d3-fd1c-98b4f802e8c7"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|██████████| 273/273 [00:00<00:00, 1820.29it/s]\n"
          ]
        }
      ],
      "source": [
        "# Collect every distinct token that occurs in any speech\n",
        "vocab = set()\n",
        "for txt in tqdm(sp.text.tolist()):\n",
        "    vocab.update(tokenize(txt))"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "a3a7c1ea",
      "metadata": {
        "id": "a3a7c1ea"
      },
      "source": [
        "Now, we get the sentiments for every word in `vocab` if that word is in `vocab_model`:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ade50794",
      "metadata": {
        "scrolled": true,
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ade50794",
        "outputId": "5b996848-9d1e-4e20-be66-6c96f95d643f"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|██████████| 10363/10363 [03:36<00:00, 47.86it/s]\n",
            "100%|██████████| 10363/10363 [03:31<00:00, 49.05it/s]\n"
          ]
        }
      ],
      "source": [
        "# Keep only the words the embedding knows, then score each of them.\n",
        "# `model.key_to_index` maps words to indices, so each membership test is a constant-time lookup;\n",
        "# filtering once and scoring the filtered list keeps `vocab` and `sentiments` aligned element by element.\n",
        "vocab = [w for w in vocab if w in model.key_to_index]\n",
        "sentiments = [get_sentiment(w, pos, neg) for w in tqdm(vocab)]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6b09d2a8",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6b09d2a8",
        "outputId": "0962e8e8-3202-4bab-d8bd-235847ad15d5"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(9477, 9477)"
            ]
          },
          "metadata": {},
          "execution_count": 11
        }
      ],
      "source": [
        "# Both lists are built from the same filtered words, so the two lengths should match\n",
        "len(vocab), len(sentiments)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "740848ad",
      "metadata": {
        "id": "740848ad"
      },
      "source": [
        "Observing the most negative and positive words in `vocab`:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7f2d613f",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "7f2d613f",
        "outputId": "4a71e18b-e895-4d9b-a3f9-2c604b0595e2"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[['horrible', -1.909968],\n",
              " ['terrible', -1.864738],\n",
              " ['horrific', -1.7512368],\n",
              " ['senseless', -1.6989087],\n",
              " ['shameful', -1.6725583],\n",
              " ['blaming', -1.6478922],\n",
              " ['appalling', -1.6252333],\n",
              " ['horrendous', -1.6073263],\n",
              " ['blamed', -1.520782],\n",
              " ['caused', -1.4948518],\n",
              " ['consequences', -1.494211],\n",
              " ['dreadful', -1.4655025],\n",
              " ['worst', -1.463681],\n",
              " ['catastrophic', -1.4476106],\n",
              " ['inaction', -1.4466158],\n",
              " ['shocking', -1.4452888],\n",
              " ['worse', -1.4109223],\n",
              " ['ugly', -1.392833],\n",
              " ['disgusting', -1.3827171],\n",
              " ['vile', -1.3789988]]"
            ]
          },
          "metadata": {},
          "execution_count": 12
        }
      ],
      "source": [
        "K = 20  # how many extreme words to list\n",
        "\n",
        "# argsort is ascending, so the first K indices point at the lowest sentiment scores\n",
        "[[vocab[idx], sentiments[idx]] for idx in np.argsort(sentiments)[:K]]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f0d9fab3",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "f0d9fab3",
        "outputId": "1218e45c-04a8-4736-f74d-fc6f1d647c09"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[['lively', 1.1587266],\n",
              " ['diligent', 1.1587312],\n",
              " ['cultivate', 1.161393],\n",
              " ['outstanding', 1.1703495],\n",
              " ['maintain', 1.1762311],\n",
              " ['harmonious', 1.1765563],\n",
              " ['reliable', 1.1805874],\n",
              " ['energetic', 1.1925653],\n",
              " ['flexible', 1.1978438],\n",
              " ['perfect', 1.1993508],\n",
              " ['provides', 1.2047523],\n",
              " ['provide', 1.2438107],\n",
              " ['guide', 1.2438293],\n",
              " ['enjoy', 1.2594359],\n",
              " ['stable', 1.2601318],\n",
              " ['healthy', 1.261559],\n",
              " ['solid', 1.2723947],\n",
              " ['innovative', 1.285939],\n",
              " ['good', 1.3519733],\n",
              " ['best', 1.4320018]]"
            ]
          },
          "metadata": {},
          "execution_count": 13
        }
      ],
      "source": [
        "# The last K indices of the ascending argsort are the highest scores (most positive word last)\n",
        "[[vocab[idx], sentiments[idx]] for idx in np.argsort(sentiments)[-K:]]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "583a58fb",
      "metadata": {
        "id": "583a58fb"
      },
      "source": [
        "Creating a dictionary containing the sentiments of all words in `vocab` that are also in `vocab_model`:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1acf2e3b",
      "metadata": {
        "id": "1acf2e3b"
      },
      "outputs": [],
      "source": [
        "# Map each scored word to its sentiment (`vocab` and `sentiments` are parallel lists)\n",
        "ws = dict(zip(vocab, sentiments))"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "0da4bac3",
      "metadata": {
        "id": "0da4bac3"
      },
      "source": [
        "Tokenizing speeches:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "90f23489",
      "metadata": {
        "id": "90f23489"
      },
      "outputs": [],
      "source": [
        "# One token list per speech, in the same order as the rows of `sp`\n",
        "texts = [tokenize(txt) for txt in sp.text.tolist()]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "5f33bbfc",
      "metadata": {
        "id": "5f33bbfc"
      },
      "source": [
        "Calculating sentiment for every speech:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8f5e7583",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "8f5e7583",
        "outputId": "4a86ade5-3d40-4e5a-b38a-9ae2fb3d2407"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "100%|██████████| 273/273 [1:30:57<00:00, 19.99s/it]\n"
          ]
        }
      ],
      "source": [
        "# Average the word sentiments of each speech. `ws` holds exactly the corpus words that have an\n",
        "# embedding, so `w in ws` is a constant-time dict lookup and no separate vocabulary scan is needed.\n",
        "texts_sent = []\n",
        "for txt in tqdm(texts):\n",
        "    scores = [ws[w] for w in txt if w in ws]\n",
        "    # A speech without any scorable word gets NaN explicitly (np.mean([]) returns NaN with a RuntimeWarning)\n",
        "    texts_sent.append(np.mean(scores) if scores else np.nan)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cc3b7cee",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "cc3b7cee",
        "outputId": "286dc563-0f95-48ae-e32a-96613c9ae49d"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "273"
            ]
          },
          "metadata": {},
          "execution_count": 17
        }
      ],
      "source": [
        "# One score per speech is expected (the recorded run shows 273)\n",
        "len(texts_sent)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "d63c908f",
      "metadata": {
        "id": "d63c908f"
      },
      "source": [
        "Creating dataset:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "45cafa91",
      "metadata": {
        "id": "45cafa91"
      },
      "outputs": [],
      "source": [
        "# Attach the per-speech score and keep only the exported columns, in export order\n",
        "sp = sp.assign(sent=pd.Series(texts_sent))[['date', 'title', 'text', 'url', 'sent']]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "17d656c1",
      "metadata": {
        "id": "17d656c1"
      },
      "source": [
        "Saving data:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6fc09581",
      "metadata": {
        "id": "6fc09581"
      },
      "outputs": [],
      "source": [
        "# Create the output folder from Python: unlike `!mkdir data`, this does not fail when the\n",
        "# folder already exists and does not depend on the shell.\n",
        "os.makedirs('data', exist_ok=True)\n",
        "sp.to_csv('data/texts_sent.csv', index=False)"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        ""
      ],
      "metadata": {
        "id": "UydSE-uBJrk8"
      },
      "id": "UydSE-uBJrk8",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.7"
    },
    "colab": {
      "name": "extracting_sentiments.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}